001 /* 002 * ScoringMatrix.java 003 * 004 * Copyright 2003 Sergio Anibal de Carvalho Junior 005 * 006 * This file is part of NeoBio. 007 * 008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of 009 * the GNU General Public License as published by the Free Software Foundation; either 010 * version 2 of the License, or (at your option) any later version. 011 * 012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 014 * PURPOSE. See the GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License along with NeoBio; 017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 018 * Boston, MA 02111-1307, USA. 019 * 020 * Proper attribution of the author as the source of the software would be appreciated. 021 * 022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net 023 * Department of Computer Science http://www.dcs.kcl.ac.uk 024 * King's College London, UK http://www.kcl.ac.uk 025 * 026 * Please visit http://neobio.sourceforge.net 027 * 028 * This project was supervised by Professor Maxime Crochemore. 029 * 030 */ 031 032 package neobio.alignment; 033 034 import java.io.Reader; 035 import java.io.StreamTokenizer; 036 import java.io.IOException; 037 038 /** 039 * This class implements a scoring scheme based on a substitution matrix. It is useful 040 * to represent PAM and BLOSUM family of amino acids scoring matrices. Its constructor 041 * loads such matrices from a file (or any other character stream). The following is an 042 * extract of a BLOSUM62 scoring matrix file: 043 * <CODE><BLOCKQUOTE><PRE> 044 * A R N D C Q E G H I L K M F P S T W Y V B Z X * 045 * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 046 * R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 047 * ... 048 * B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 049 * Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 050 * X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 051 * * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 052 * </PRE></BLOCKQUOTE></CODE> 053 * 054 * <P>Matrices are expected to follow this format. They must have one row an one column 055 * for each defined character (not necessarily in the same order). Each row and column 056 * must start with a distinct character (no repetition) and all row characters must have a 057 * correspondent column, and vice versa.</P> 058 * 059 * <P>Value at position (i,j) represent the score of substituting character of row i for 060 * character of column j. Insertion penalties are specified by the last row while deletion 061 * penalties must be located at the last column (both represented by the special character 062 * defined by the <CODE>INDEL_CHAR</CODE> constant). Note that it only supports an 063 * additive gap cost function. In case any of this rules are not followed, an 064 * {@linkplain InvalidScoringMatrixException} exception is raised by the constructor.</P> 065 * 066 * <P>If a scoring operation (substitution, insertion or deletion) involves a character 067 * not found in the matrix, an exception is raised.</P> 068 * 069 * @author Sergio A. de Carvalho Jr. 070 * @see InvalidScoringMatrixException 071 */ 072 public class ScoringMatrix extends ScoringScheme 073 { 074 /** 075 * The character that indicates the row and column for insertion and deletion 076 * penalties in the matrix. 077 */ 078 protected static final char INDEL_CHAR = '*'; 079 080 /** 081 * The character used to start a comment line in the scoring matrix file. 082 */ 083 protected static final char COMMENT_CHAR = '#'; 084 085 /** 086 * Stores matrix column headers in the order they were found. 087 */ 088 protected String col_codes; 089 090 /** 091 * Stores matrix row headers in the order they were found. 092 */ 093 protected String row_codes; 094 095 /** 096 * Stores values for each operation (substitution, insertion or deletion) defined by 097 * this matrix. 098 */ 099 protected int matrix[][]; 100 101 /** 102 * Dimension of the (squared) matrix. 103 */ 104 protected int dimension; 105 106 /** 107 * The maximum absolute score that this matrix can return for any substitution, 108 * deletion or insertion. 109 */ 110 protected int max_absolute_score; 111 112 /** 113 * Creates a new instance of a substitution matrix loaded from the character stream. 114 * The case of characters is significant when subsequently computing their score. 115 * 116 * @param input character stream from where the matrix is read 117 * @throws IOException if an I/O operation fails when reading from input 118 * @throws InvalidScoringMatrixException if the matrix does not comply with the 119 * specification 120 */ 121 public ScoringMatrix (Reader input) 122 throws IOException, InvalidScoringMatrixException 123 { 124 this (input, true); 125 } 126 127 /** 128 * Creates a new instance of a substitution matrix loaded from the character stream. 129 * If <CODE>case_sensitive</CODE> is <CODE>true</CODE>, the case of characters is 130 * significant when subsequently computing their score; otherwise the case is 131 * ignored. 132 * 133 * @param input character stream from where the matrix is read 134 * @param case_sensitive <CODE>true</CODE> if the case of characters must be 135 * @throws IOException if an I/O operation fails when reading from input 136 * @throws InvalidScoringMatrixException if the matrix does not comply with the 137 * specification 138 */ 139 public ScoringMatrix (Reader input, boolean case_sensitive) 140 throws IOException, InvalidScoringMatrixException 141 { 142 super (case_sensitive); 143 144 StreamTokenizer in; 145 StringBuffer buf = new StringBuffer(); 146 int row, col, max_abs = 0; 147 char c; 148 149 // create a stream tokenizer on top of the input 150 // stream and set the COMMENT_CHAR as the comment character 151 in = new StreamTokenizer(input); 152 in.commentChar(COMMENT_CHAR); 153 154 // consider ends of line when reading the first row 155 in.eolIsSignificant(true); 156 157 // skip blank lines (if any) 158 for (in.nextToken(); in.ttype == StreamTokenizer.TT_EOL; in.nextToken()); 159 160 // read first row: column character codes 161 while ((in.ttype != StreamTokenizer.TT_EOF) && 162 (in.ttype != StreamTokenizer.TT_EOL)) 163 { 164 if (in.ttype == StreamTokenizer.TT_WORD) 165 { 166 if (in.sval.length() > 1) 167 throw new InvalidScoringMatrixException 168 ("Column headers must have one-character only."); 169 170 buf.append(in.sval.charAt(0)); 171 } 172 else if (in.ttype == INDEL_CHAR) 173 { 174 buf.append(INDEL_CHAR); 175 } 176 else 177 { 178 throw new InvalidScoringMatrixException("Column headers must be " + 179 "one-character codes or the special character '" + INDEL_CHAR + "'."); 180 } 181 182 in.nextToken(); 183 } 184 185 // convert everything to upper case if it's not case sensitive 186 if (case_sensitive) 187 col_codes = buf.toString(); 188 else 189 col_codes = buf.toString().toUpperCase(); 190 191 dimension = col_codes.length(); 192 193 // check if there's a column for deletion penalties 194 if (col_codes.indexOf (INDEL_CHAR) == -1) 195 throw new InvalidScoringMatrixException 196 ("Matrix have no column for deletion penalties."); 197 198 // check if there is at least one character code (besides the INDEL char) 199 if (dimension < 2) 200 throw new InvalidScoringMatrixException 201 ("Matrix must have at least one column with a character code."); 202 203 // check for repeated column codes 204 for (int i = 0; i < dimension; i++) 205 if (col_codes.indexOf(col_codes.charAt(i),i+1) > i) 206 throw new InvalidScoringMatrixException 207 ("Columns must have distinct one-character codes."); 208 209 // allocate matrix 210 matrix = new int[dimension][dimension]; 211 212 // reset buffer 213 buf.delete (0, dimension); 214 215 // from now on, ignore ends of line 216 in.eolIsSignificant(false); 217 if (in.ttype == StreamTokenizer.TT_EOL) in.nextToken(); 218 219 // read rest of matrix (one line for each character, but 220 // not necessarily in the same order as the columns) 221 for (row = 0; row < dimension && in.ttype != StreamTokenizer.TT_EOF; row++) 222 { 223 // start reading the line: the character code must come first 224 if (in.ttype == StreamTokenizer.TT_WORD) 225 { 226 if (in.sval.length() > 1) 227 throw new InvalidScoringMatrixException 228 ("Codes must have one character only."); 229 230 buf.append(in.sval.charAt(0)); 231 } 232 else if (in.ttype == INDEL_CHAR) 233 { 234 buf.append(INDEL_CHAR); 235 } 236 else 237 { 238 throw new InvalidScoringMatrixException ("Rows must start with an" + 239 " one-character code or the special character '" + INDEL_CHAR + "'."); 240 } 241 242 // now, the set of values 243 for (col = 0; col < dimension; col++) 244 { 245 // start reading the values 246 if (in.nextToken() != StreamTokenizer.TT_NUMBER) 247 throw new InvalidScoringMatrixException 248 ("Invalid value at row " + (row+1) + ", column " + (col+1) + "."); 249 250 matrix[row][col] = (int) in.nval; 251 252 if (Math.abs(matrix[row][col]) > max_abs) 253 max_abs = Math.abs(matrix[row][col]); 254 } 255 256 in.nextToken(); 257 } 258 259 // convert everything to upper case if it's not case sensitive 260 if (case_sensitive) 261 row_codes = buf.toString(); 262 else 263 row_codes = buf.toString().toUpperCase(); 264 265 // check if read as many rows as columns 266 if (row_codes.length() != dimension) 267 throw new InvalidScoringMatrixException 268 ("Matrix must have as many rows as columns."); 269 270 // check if there's a row for insertion penalties 271 if (row_codes.indexOf(INDEL_CHAR) == -1) 272 throw new InvalidScoringMatrixException 273 ("Matrix have no row for insertion penalties."); 274 275 // check for repeated row codes 276 for (int i = 0; i < dimension; i++) 277 if (row_codes.indexOf(row_codes.charAt(i),i+1) > i) 278 throw new InvalidScoringMatrixException 279 ("Rows must have distinct one-character codes."); 280 281 // check if all rows have a corresponding column 282 for (int i = 0; i < dimension; i++) 283 if (col_codes.indexOf(c = row_codes.charAt(i)) == -1) 284 throw new InvalidScoringMatrixException 285 ("There is no corresponding column for row character '" + c + "'."); 286 287 // store the maximum absolute value found 288 this.max_absolute_score = max_abs; 289 } 290 291 /** 292 * Returns the score of a substitution of character <CODE>a</CODE> for character 293 * <CODE>b</CODE> according to this scoring matrix. 294 * 295 * @param a first character 296 * @param b second character 297 * @return score of a substitution of character <CODE>a</CODE> for <CODE>b</CODE> 298 * @throws IncompatibleScoringSchemeException if this substitution is not defined 299 */ 300 public int scoreSubstitution (char a, char b) 301 throws IncompatibleScoringSchemeException 302 { 303 int r,c; 304 305 if (case_sensitive) 306 { 307 r = row_codes.indexOf(a); 308 c = col_codes.indexOf(b); 309 } 310 else 311 { 312 r = row_codes.indexOf(Character.toUpperCase(a)); 313 c = col_codes.indexOf(Character.toUpperCase(b)); 314 } 315 316 if (r < 0 || c < 0) 317 throw new IncompatibleScoringSchemeException ("Substitution of character " + 318 a + " for " + b + " is not defined."); 319 320 return matrix[r][c]; 321 } 322 323 /** 324 * Returns the score of an insertion of character <CODE>a</CODE> according to this 325 * scoring matrix. 326 * 327 * @param a character to be inserted 328 * @return score of insertion of <CODE>a</CODE> 329 * @throws IncompatibleScoringSchemeException if this character is not recognised 330 */ 331 public int scoreInsertion (char a) throws IncompatibleScoringSchemeException 332 { 333 return scoreSubstitution (INDEL_CHAR, a); 334 } 335 336 /** 337 * Returns the score of a deletion of character <CODE>a</CODE> according to this 338 * scoring matrix. 339 * 340 * @param a character to be deleted 341 * @return score of deletion of <CODE>a</CODE> 342 * @throws IncompatibleScoringSchemeException if this character is not recognised 343 */ 344 public int scoreDeletion (char a) throws IncompatibleScoringSchemeException 345 { 346 return scoreSubstitution (a, INDEL_CHAR); 347 } 348 349 /** 350 * Tells whether this scoring scheme supports partial matches, which it does, although 351 * a particular scoring matrix loaded by this instace might not. A partial match is 352 * a situation when two characters are not equal but, for any reason, are regarded 353 * as similar by this scoring scheme, which then returns a positive score value. This 354 * is common for amino acid scoring matrices. 355 * 356 * @return always return <CODE>true</CODE> 357 */ 358 public boolean isPartialMatchSupported () 359 { 360 return true; 361 } 362 363 /** 364 * Returns the maximum absolute score that this scoring scheme can return for any 365 * substitution, deletion or insertion. 366 * 367 * @return maximum absolute score that can be returned 368 */ 369 public int maxAbsoluteScore () 370 { 371 return max_absolute_score; 372 } 373 374 /** 375 * Returns a String representation of this scoring matrix. 376 * 377 * @return a String representation of this scoring matrix 378 */ 379 public String toString () 380 { 381 int row, col; 382 383 StringBuffer buf = new StringBuffer(); 384 385 // column numbers 386 buf.append("Scoring matrix:\n\t"); 387 for (col = 0; col < dimension; col++) 388 { 389 buf.append("\t" + col); 390 } 391 buf.append("\n\t"); 392 393 // column headers 394 for (col = 0; col < dimension; col++) 395 { 396 buf.append('\t'); 397 buf.append(col_codes.charAt(col)); 398 } 399 400 // rest of matrix 401 for (row = 0; row < dimension; row++) 402 { 403 // row number and code 404 buf.append("\n" + row + "\t" + row_codes.charAt(row)); 405 406 for (col = 0; col < dimension; col++) 407 { 408 buf.append('\t'); 409 buf.append(matrix[row][col]); 410 } 411 } 412 413 return buf.toString(); 414 } 415 }